2aa99ff0ca7a50af99ba59a3e8a54d7a819e7808 angie Thu Aug 26 12:30:42 2021 -0700 Yikes, I was forgetting to expand condensed nodes in the big tree when making custom track VCF for subtrees, resulting in some sequences not being found & having no alt alleles in VCF. diff --git src/hg/hgPhyloPlace/writeCustomTracks.c src/hg/hgPhyloPlace/writeCustomTracks.c index 70b7ff0..fd27450 100644 --- src/hg/hgPhyloPlace/writeCustomTracks.c +++ src/hg/hgPhyloPlace/writeCustomTracks.c @@ -303,74 +303,83 @@ for (s = sampleNames; s != NULL; s = s->next) fprintf(f, "\t%s", s->name); fputc('\n', f); } static struct slRef *slRefListCopyReverse(struct slRef *inList) /* Return a clone of inList but in reverse order. */ { struct slRef *outList = NULL; struct slRef *ref; for (ref = inList; ref != NULL; ref = ref->next) slAddHead(&outList, slRefNew(ref->val)); return outList; } -static void treeToBaseAlleles(struct phyloTree *node, char *refBases, char **sampleBases, - struct hash *sampleToIx, struct slRef *parentMuts) +static void treeToBaseAlleles(struct phyloTree *node, struct hash *condensedNodes, char *refBases, + char **sampleBases, struct hash *sampleToIx, struct slRef *parentMuts) /* Traverse tree, building up an array of reference bases indexed by position, and an array * (indexed by pos) of sample allele arrays indexed by sampleToIx. */ { struct slRef *nodeMuts = parentMuts; if (node->priv != NULL) { // Start out with a copy of parent node mutations in reverse order so we can add at head // and then reverse all. struct singleNucChange *sncs = node->priv; nodeMuts = slRefListCopyReverse(parentMuts); slAddHead(&nodeMuts, slRefNew(sncs)); slReverse(&nodeMuts); } if (node->numEdges > 0) { int i; for (i = 0; i < node->numEdges; i++) - treeToBaseAlleles(node->edges[i], refBases, sampleBases, sampleToIx, nodeMuts); + treeToBaseAlleles(node->edges[i], condensedNodes, refBases, sampleBases, sampleToIx, + nodeMuts); } else { // Leaf node: if in sampleToIx, then for each mutation (beware back-mutations), // set refBase if it has not already been set and set sample allele. - int ix = hashIntValDefault(sampleToIx, node->ident->name, -1); + struct slName *nameList = hashFindVal(condensedNodes, node->ident->name); + if (nameList == NULL) + nameList = slNameNew(node->ident->name); + struct slName *name; + for (name = nameList; name != NULL; name = name->next) + { + int ix = hashIntValDefault(sampleToIx, name->name, -1); if (ix >= 0) { struct slRef *mlRef; for (mlRef = nodeMuts; mlRef != NULL; mlRef = mlRef->next) { struct singleNucChange *snc, *sncs = mlRef->val; for (snc = sncs; snc != NULL; snc = snc->next) { // Don't let a back-mutation overwrite the true ref value: if (!refBases[snc->chromStart]) refBases[snc->chromStart] = snc->refBase; if (sampleBases[snc->chromStart] == NULL) AllocArray(sampleBases[snc->chromStart], sampleToIx->elCount); sampleBases[snc->chromStart][ix] = snc->newBase; } } } } + slFreeList(&nameList); + } if (node->priv != NULL) slFreeList(&nodeMuts); } static void vcfToBaseAlleles(struct vcfFile *vcff, char *refBases, char **sampleBases, struct hash *sampleToIx) /* Build up an array of reference bases indexed by position, and an array (indexed by pos) * of sample allele arrays indexed by sampleToIx, from variants & genotypes in vcff. */ { int gtIxToSampleIx[vcff->genotypeCount]; int gtIx; for (gtIx = 0; gtIx < vcff->genotypeCount; gtIx++) { int ix = hashIntValDefault(sampleToIx, vcff->genotypeIds[gtIx], -1); gtIxToSampleIx[gtIx] = ix; @@ -498,76 +507,77 @@ fprintf(f, ",%c", toupper(altAlleles[altIx])); fputs("\t.\t.\t", f); fprintf(f, "AC=%d", altCounts[0]); for (altIx = 1; altIx < altCount; altIx++) fprintf(f, ",%d", altCounts[altIx]); fprintf(f, ";AN=%d\tGT", sampleCount - noCallCount); int gtIx; for (gtIx = 0; gtIx < sampleCount; gtIx++) fprintf(f, "\t%c", genotypes[gtIx]); fputc('\n', f); } } } static void writeSubtreeVcf(FILE *f, struct hash *sampleToIx, - struct vcfFile *userVcf, struct phyloTree *bigTree) + struct vcfFile *userVcf, struct mutationAnnotatedTree *bigTree) /* Write rows of VCF with genotypes for samples in sampleToIx (with order determined by sampleToIx) * with some samples found in userVcf and the rest found in bigTree which has been annotated * with single-nucleotide variants. */ { char refBases[chromSize]; memset(refBases, 0, sizeof refBases); char *sampleBases[chromSize]; memset(sampleBases, 0, sizeof sampleBases); -treeToBaseAlleles(bigTree, refBases, sampleBases, sampleToIx, NULL); +treeToBaseAlleles(bigTree->tree, bigTree->condensedNodes, refBases, sampleBases, sampleToIx, NULL); vcfToBaseAlleles(userVcf, refBases, sampleBases, sampleToIx); int sampleCount = sampleToIx->elCount; baseAllelesToVcf(refBases, sampleBases, chromSize, sampleCount, f); int i; for (i = 0; i < chromSize; i++) freeMem(sampleBases[i]); } static void addSubtreeCustomTracks(FILE *ctF, char *userVcfFile, struct subtreeInfo *subtreeInfoList, - struct hash *samplePlacements, struct phyloTree *bigTree, + struct hash *samplePlacements, + struct mutationAnnotatedTree *bigTree, char *source, int fontHeight, int *pStartTime) /* For each subtree trashfile, write VCF+treeFile custom track text to ctF. */ { struct vcfFile *userVcf = parseUserVcf(userVcfFile, pStartTime); if (! userVcf) { warn("Problem parsing VCF file with user variants '%s'; can't make subtree subtracks.", userVcfFile); return; } if (!bigTree) return; struct subtreeInfo *ti; for (ti = subtreeInfoList; ti != NULL; ti = ti->next) { writeSubtreeTrackLine(ctF, ti, source, fontHeight); writeVcfHeader(ctF, ti->subtreeNameList); writeSubtreeVcf(ctF, ti->subtreeIdToIx, userVcf, bigTree); fputc('\n', ctF); } vcfFileFree(&userVcf); reportTiming(pStartTime, "write subtree custom tracks"); } struct tempName *writeCustomTracks(struct tempName *vcfTn, struct usherResults *ur, - struct slName *sampleIds, struct phyloTree *bigTree, + struct slName *sampleIds, struct mutationAnnotatedTree *bigTree, char *source, int fontHeight, struct phyloTree **retSampleTree, int *pStartTime) /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */ { struct tempName *ctVcfTn = userVcfWithImputedBases(vcfTn, ur->samplePlacements, sampleIds); struct tempName *ctTn; AllocVar(ctTn); trashDirFile(ctTn, "ct", "ct_pp", ".ct"); FILE *ctF = mustOpen(ctTn->forCgi, "w"); int subtreeCount = slCount(ur->subtreeInfoList); if (subtreeCount <= MAX_SUBTREE_CTS) addSubtreeCustomTracks(ctF, ctVcfTn->forCgi, ur->subtreeInfoList, ur->samplePlacements, bigTree, source, fontHeight, pStartTime); else printf("<p>Subtree custom tracks are added when there are at most %d subtrees, "