0fffa3c31de4845a9bd3f06c0992f971e4d8a7a3 angie Fri Oct 28 15:08:06 2022 -0700 Performance improvements for trees with millions of sequences: * Use @yceh's usher-sampled-server if configured; it preloads protobufs and can start placing sequences immediately using usher-sampled, a faster version of usher * Use usher-sampled instead of usher if server is not configured but usher-sampled is available * Load sample metadata file in a pthread while usher(-sampled(-server)) or matUtils is running * Skip checking for sample name clashes in uploaded fasta when using usher-sampled(-server)'s new --no-ignore-prefix option (but look for the prefix when parsing results) * Avoid parsing the protobuf and traversing the big tree unless absolutely necessary ** Subtrees from usher/matUtils have not included condensed nodes in a long time; remove lots of condensedNodes/summarization code from phyloPlace.c, runUsher.c, writeCustomTracks.c ** Use subtrees instead of big tree when possible (in findNearestNeighbor, treeToBaseAlleles, uploadedSamplesTree) ** Skip the informativeBases stuff that inhibits masking of sites from Problematic Sites set when the tree was built with an earlier version -- that pretty much never applies anymore now that only daily-updated trees are offered, not a range from old to new. ** Allow config.ra to specify a flat file of sample names (needed for searching user's uploaded names/IDs before calling matUtils) instead of getting names from the big tree diff --git src/hg/hgPhyloPlace/writeCustomTracks.c src/hg/hgPhyloPlace/writeCustomTracks.c index eae53d6..292df91 100644 --- src/hg/hgPhyloPlace/writeCustomTracks.c +++ src/hg/hgPhyloPlace/writeCustomTracks.c @@ -42,31 +42,31 @@ static boolean anyImputedBases(struct hash *samplePlacements, struct slName *sampleIds) /* Return TRUE if any sample has at least one imputed base. */ { struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) { struct placementInfo *info = hashFindVal(samplePlacements, sample->name); if (info && info->imputedBases) return TRUE; } return FALSE; } struct tempName *userVcfWithImputedBases(struct tempName *userVcfTn, struct hash *samplePlacements, - struct slName *sampleIds, int chromSize) + struct slName *sampleIds, int chromSize, int *pStartTime) /* If usher imputed the values of ambiguous bases in user's VCF, then swap in * the imputed values and write a new VCF to use as the custom track. Otherwise just use * the user's VCF for the custom track. */ { struct tempName *ctVcfTn = userVcfTn; if (anyImputedBases(samplePlacements, sampleIds)) { AllocVar(ctVcfTn); trashDirFile(ctVcfTn, "ct", "imputed", ".vcf"); FILE *f = mustOpen(ctVcfTn->forCgi, "w"); struct lineFile *lf = lineFileOpen(userVcfTn->forCgi, TRUE); int sampleCount = slCount(sampleIds); int colCount = VCF_NUM_COLS_BEFORE_GENOTYPES + sampleCount; char ***impsByPos = imputedBasesByPosition(samplePlacements, sampleIds, chromSize); char *line; @@ -164,30 +164,31 @@ } } } } int i; fputs(words[0], f); for (i = 1; i < colCount; i++) { fputc('\t', f); fputs(words[i], f); } } fputc('\n', f); } carefulClose(&f); + reportTiming(pStartTime, "Make user VCF with imputed bases"); } return ctVcfTn; } static struct phyloTree *uploadedSamplesTree(char *sampleTreeFile, char *bigTreeFile, struct slName *sampleIds) /* Read in tree from bigTreeFile, prune all nodes that have no leaf descendants in sampleIds, * save pruned tree to sampleTreeFile. */ { struct phyloTree *tree = phyloOpenTree(bigTreeFile); tree = phyloPruneToIds(tree, sampleIds); FILE *f = mustOpen(sampleTreeFile, "w"); phyloPrintTree(tree, f); carefulClose(&f); return tree; @@ -225,35 +226,34 @@ struct lineFile *vcfLf = lineFileOpen(vcfTn->forCgi, TRUE); char *line; while (lineFileNext(vcfLf, &line, NULL)) fprintf(ctF, "%s\n",line); lineFileClose(&vcfLf); reportTiming(pStartTime, "add custom track for uploaded samples"); return sampleTree; } static struct vcfFile *parseUserVcf(char *userVcfFile, int chromSize, int *pStartTime) /* Read in user VCF file and parse genotypes. */ { struct vcfFile *userVcf = vcfFileMayOpen(userVcfFile, NULL, 0, chromSize, 0, chromSize, TRUE); if (! userVcf) return NULL; -reportTiming(pStartTime, "read userVcf"); struct vcfRecord *rec; for (rec = userVcf->records; rec != NULL; rec = rec->next) vcfParseGenotypesGtOnly(rec); -reportTiming(pStartTime, "parse userVcf genotypes"); +reportTiming(pStartTime, "read userVcf and parse genotypes"); return userVcf; } static void writeSubtreeTrackLine(FILE *ctF, struct subtreeInfo *ti, int subtreeNum, char *source, char *geneTrack, int fontHeight) /* Write a custom track line to ctF for one subtree. */ { fprintf(ctF, "track name=subtree%d", subtreeNum); // Keep the description from being too long in order to avoid buffer overflow in hgTrackUi // (and limited space for track title in hgTracks) int descLen = fprintf(ctF, " description='Subtree %d: uploaded sample%s %s", subtreeNum, (slCount(ti->subtreeUserSampleIds) > 1 ? "s" : ""), ti->subtreeUserSampleIds->name); struct slName *id; for (id = ti->subtreeUserSampleIds->next; id != NULL; id = id->next) @@ -285,113 +285,99 @@ for (s = sampleNames; s != NULL; s = s->next) fprintf(f, "\t%s", s->name); fputc('\n', f); } static struct slRef *slRefListCopyReverse(struct slRef *inList) /* Return a clone of inList but in reverse order. */ { struct slRef *outList = NULL; struct slRef *ref; for (ref = inList; ref != NULL; ref = ref->next) slAddHead(&outList, slRefNew(ref->val)); return outList; } -static void treeToBaseAlleles(struct phyloTree *node, struct hash *condensedNodes, char *refBases, +static void treeToBaseAlleles(struct phyloTree *node, char *refBases, char **sampleBases, struct hash *sampleToIx, struct slRef *parentMuts) /* Traverse tree, building up an array of reference bases indexed by position, and an array * (indexed by pos) of sample allele arrays indexed by sampleToIx. */ { struct slRef *nodeMuts = parentMuts; if (node->priv != NULL) { // Start out with a copy of parent node mutations in reverse order so we can add at head // and then reverse all. struct singleNucChange *sncs = node->priv; nodeMuts = slRefListCopyReverse(parentMuts); slAddHead(&nodeMuts, slRefNew(sncs)); slReverse(&nodeMuts); } if (node->numEdges > 0) { int i; for (i = 0; i < node->numEdges; i++) - treeToBaseAlleles(node->edges[i], condensedNodes, refBases, sampleBases, sampleToIx, - nodeMuts); + treeToBaseAlleles(node->edges[i], refBases, sampleBases, sampleToIx, nodeMuts); } else { // Leaf node: if in sampleToIx, then for each mutation (beware back-mutations), // set refBase if it has not already been set and set sample allele. - boolean allocdHere = FALSE; - struct slName *nameList = hashFindVal(condensedNodes, node->ident->name); - if (nameList == NULL) - { - nameList = slNameNew(node->ident->name); - allocdHere = TRUE; - } - struct slName *name; - for (name = nameList; name != NULL; name = name->next) - { - int ix = hashIntValDefault(sampleToIx, name->name, -1); + int ix = hashIntValDefault(sampleToIx, node->ident->name, -1); if (ix >= 0) { struct slRef *mlRef; for (mlRef = nodeMuts; mlRef != NULL; mlRef = mlRef->next) { struct singleNucChange *snc, *sncs = mlRef->val; for (snc = sncs; snc != NULL; snc = snc->next) { // Don't let a back-mutation overwrite the true ref value: if (!refBases[snc->chromStart]) refBases[snc->chromStart] = snc->refBase; if (sampleBases[snc->chromStart] == NULL) AllocArray(sampleBases[snc->chromStart], sampleToIx->elCount); sampleBases[snc->chromStart][ix] = snc->newBase; } } } } - if (allocdHere) - slFreeList(&nameList); - } if (node->priv != NULL) slFreeList(&nodeMuts); } static void vcfToBaseAlleles(struct vcfFile *vcff, char *refBases, char **sampleBases, struct hash *sampleToIx) /* Build up an array of reference bases indexed by position, and an array (indexed by pos) * of sample allele arrays indexed by sampleToIx, from variants & genotypes in vcff. */ { int gtIxToSampleIx[vcff->genotypeCount]; int gtIx; for (gtIx = 0; gtIx < vcff->genotypeCount; gtIx++) { int ix = hashIntValDefault(sampleToIx, vcff->genotypeIds[gtIx], -1); gtIxToSampleIx[gtIx] = ix; } struct vcfRecord *rec; for (rec = vcff->records; rec != NULL; rec = rec->next) { for (gtIx = 0; gtIx < vcff->genotypeCount; gtIx++) { int ix = gtIxToSampleIx[gtIx]; if (ix >= 0) { - // If refBases was already set by bigTree, go with that. + // If refBases was already set by tree, go with that. if (!refBases[rec->chromStart]) refBases[rec->chromStart] = rec->alleles[0][0]; if (sampleBases[rec->chromStart] == NULL) AllocArray(sampleBases[rec->chromStart], sampleToIx->elCount); int alIx = rec->genotypes[gtIx].hapIxA; if (alIx < 0) sampleBases[rec->chromStart][ix] = '.'; else if (alIx > 0) sampleBases[rec->chromStart][ix] = rec->alleles[alIx][0]; // If alIx == 0 (ref), leave the sample base at '\0', the default assumption. } } } } @@ -494,95 +480,92 @@ fprintf(f, ",%c", toupper(altAlleles[altIx])); fputs("\t.\t.\t", f); fprintf(f, "AC=%d", altCounts[0]); for (altIx = 1; altIx < altCount; altIx++) fprintf(f, ",%d", altCounts[altIx]); fprintf(f, ";AN=%d\tGT", sampleCount - noCallCount); int gtIx; for (gtIx = 0; gtIx < sampleCount; gtIx++) fprintf(f, "\t%c", genotypes[gtIx]); fputc('\n', f); } } } static void writeSubtreeVcf(FILE *f, struct hash *sampleToIx, char *chrom, int chromSize, - struct vcfFile *userVcf, struct mutationAnnotatedTree *bigTree) + struct vcfFile *userVcf, struct phyloTree *subtree) /* Write rows of VCF with genotypes for samples in sampleToIx (with order determined by sampleToIx) - * with some samples found in userVcf and the rest found in bigTree which has been annotated + * with some samples found in userVcf and the rest found in subtree which has been annotated * with single-nucleotide variants. */ { char refBases[chromSize]; memset(refBases, 0, sizeof refBases); char *sampleBases[chromSize]; memset(sampleBases, 0, sizeof sampleBases); -treeToBaseAlleles(bigTree->tree, bigTree->condensedNodes, refBases, sampleBases, sampleToIx, NULL); +treeToBaseAlleles(subtree, refBases, sampleBases, sampleToIx, NULL); vcfToBaseAlleles(userVcf, refBases, sampleBases, sampleToIx); int sampleCount = sampleToIx->elCount; baseAllelesToVcf(refBases, sampleBases, chrom, chromSize, sampleCount, f); int i; for (i = 0; i < chromSize; i++) freeMem(sampleBases[i]); } static void addSubtreeCustomTracks(FILE *ctF, char *userVcfFile, struct subtreeInfo *subtreeInfoList, struct hash *samplePlacements, char *chrom, int chromSize, - struct mutationAnnotatedTree *bigTree, char *source, char *geneTrack, int fontHeight, int *pStartTime) /* For each subtree trashfile, write VCF+treeFile custom track text to ctF. */ { struct vcfFile *userVcf = parseUserVcf(userVcfFile, chromSize, pStartTime); if (! userVcf) { warn("Problem parsing VCF file with user variants '%s'; can't make subtree subtracks.", userVcfFile); return; } -if (!bigTree) - return; struct subtreeInfo *ti; int subtreeNum; for (subtreeNum = 1, ti = subtreeInfoList; ti != NULL; ti = ti->next, subtreeNum++) { writeSubtreeTrackLine(ctF, ti, subtreeNum, source, geneTrack, fontHeight); writeVcfHeader(ctF, ti->subtreeNameList); - writeSubtreeVcf(ctF, ti->subtreeIdToIx, chrom, chromSize, userVcf, bigTree); + writeSubtreeVcf(ctF, ti->subtreeIdToIx, chrom, chromSize, userVcf, ti->subtree); fputc('\n', ctF); } vcfFileFree(&userVcf); reportTiming(pStartTime, "write subtree custom tracks"); } struct tempName *writeCustomTracks(char *db, struct tempName *vcfTn, struct usherResults *ur, - struct slName *sampleIds, struct mutationAnnotatedTree *bigTree, - char *source, int fontHeight, + struct slName *sampleIds, char *source, int fontHeight, struct phyloTree **retSampleTree, int *pStartTime) /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */ { char *chrom = hDefaultChrom(db); int chromSize = hChromSize(db, chrom); char *geneTrack = phyloPlaceDbSetting(db, "geneTrack"); // Default to SARS-CoV-2 or hMPXV values if setting is missing from config.ra. if (isEmpty(geneTrack)) geneTrack = sameString(db, "wuhCor1") ? "ncbiGeneBGP" : "ncbiGene"; struct tempName *ctVcfTn = userVcfWithImputedBases(vcfTn, ur->samplePlacements, sampleIds, - chromSize); + chromSize, pStartTime); struct tempName *ctTn; AllocVar(ctTn); trashDirFile(ctTn, "ct", "ct_pp", ".ct"); FILE *ctF = mustOpen(ctTn->forCgi, "w"); int subtreeCount = slCount(ur->subtreeInfoList); if (subtreeCount <= MAX_SUBTREE_CTS) addSubtreeCustomTracks(ctF, ctVcfTn->forCgi, ur->subtreeInfoList, ur->samplePlacements, - chrom, chromSize, bigTree, source, geneTrack, fontHeight, pStartTime); + chrom, chromSize, source, geneTrack, fontHeight, pStartTime); else printf("<p>Subtree custom tracks are added when there are at most %d subtrees, " "but %d subtrees were found.</p>\n", MAX_SUBTREE_CTS, subtreeCount); -struct phyloTree *sampleTree = addSampleOnlyCustomTrack(ctF, ctVcfTn, ur->bigTreePlusTn->forCgi, +struct phyloTree *sampleTree = addSampleOnlyCustomTrack(ctF, ctVcfTn, + ur->singleSubtreeInfo->subtreeTn->forCgi, sampleIds, geneTrack, fontHeight, pStartTime); if (retSampleTree) *retSampleTree = sampleTree; carefulClose(&ctF); return ctTn; }