0fffa3c31de4845a9bd3f06c0992f971e4d8a7a3
angie
  Fri Oct 28 15:08:06 2022 -0700
Performance improvements for trees with millions of sequences:
* Use @yceh's usher-sampled-server if configured; it preloads protobufs and can start placing sequences immediately using usher-sampled, a faster version of usher
* Use usher-sampled instead of usher if server is not configured but usher-sampled is available
* Load sample metadata file in a pthread while usher(-sampled(-server)) or matUtils is running
* Skip checking for sample name clashes in uploaded fasta when using usher-sampled(-server)'s new --no-ignore-prefix option (but look for the prefix when parsing results)
* Avoid parsing the protobuf and traversing the big tree unless absolutely necessary
** Subtrees from usher/matUtils have not included condensed nodes in a long time; remove lots of condensedNodes/summarization code from phyloPlace.c, runUsher.c, writeCustomTracks.c
** Use subtrees instead of big tree when possible (in findNearestNeighbor, treeToBaseAlleles, uploadedSamplesTree)
** Skip the informativeBases stuff that inhibits masking of sites from Problematic Sites set when the tree was built with an earlier version -- that pretty much never applies anymore now that only daily-updated trees are offered, not a range from old to new.
** Allow config.ra to specify a flat file of sample names (needed for searching user's uploaded names/IDs before calling matUtils) instead of getting names from the big tree

diff --git src/hg/hgPhyloPlace/writeCustomTracks.c src/hg/hgPhyloPlace/writeCustomTracks.c
index eae53d6..292df91 100644
--- src/hg/hgPhyloPlace/writeCustomTracks.c
+++ src/hg/hgPhyloPlace/writeCustomTracks.c
@@ -42,31 +42,31 @@
 
 static boolean anyImputedBases(struct hash *samplePlacements, struct slName *sampleIds)
 /* Return TRUE if any sample has at least one imputed base. */
 {
 struct slName *sample;
 for (sample = sampleIds;  sample != NULL;  sample = sample->next)
     {
     struct placementInfo *info = hashFindVal(samplePlacements, sample->name);
     if (info && info->imputedBases)
         return TRUE;
     }
 return FALSE;
 }
 
 struct tempName *userVcfWithImputedBases(struct tempName *userVcfTn, struct hash *samplePlacements,
-                                         struct slName *sampleIds, int chromSize)
+                                         struct slName *sampleIds, int chromSize, int *pStartTime)
 /* If usher imputed the values of ambiguous bases in user's VCF, then swap in
  * the imputed values and write a new VCF to use as the custom track.  Otherwise just use
  * the user's VCF for the custom track. */
 {
 struct tempName *ctVcfTn = userVcfTn;
 if (anyImputedBases(samplePlacements, sampleIds))
     {
     AllocVar(ctVcfTn);
     trashDirFile(ctVcfTn, "ct", "imputed", ".vcf");
     FILE *f = mustOpen(ctVcfTn->forCgi, "w");
     struct lineFile *lf = lineFileOpen(userVcfTn->forCgi, TRUE);
     int sampleCount = slCount(sampleIds);
     int colCount = VCF_NUM_COLS_BEFORE_GENOTYPES + sampleCount;
     char ***impsByPos = imputedBasesByPosition(samplePlacements, sampleIds, chromSize);
     char *line;
@@ -164,30 +164,31 @@
                                 }
                             }
                         }
                 }
             int i;
             fputs(words[0], f);
             for (i = 1;  i < colCount;  i++)
                 {
                 fputc('\t', f);
                 fputs(words[i], f);
                 }
             }
             fputc('\n', f);
         }
     carefulClose(&f);
+    reportTiming(pStartTime, "Make user VCF with imputed bases");
     }
 return ctVcfTn;
 }
 
 static struct phyloTree *uploadedSamplesTree(char *sampleTreeFile, char *bigTreeFile,
                                              struct slName *sampleIds)
 /* Read in tree from bigTreeFile, prune all nodes that have no leaf descendants in sampleIds,
  * save pruned tree to sampleTreeFile. */
 {
 struct phyloTree *tree = phyloOpenTree(bigTreeFile);
 tree = phyloPruneToIds(tree, sampleIds);
 FILE *f = mustOpen(sampleTreeFile, "w");
 phyloPrintTree(tree, f);
 carefulClose(&f);
 return tree;
@@ -225,35 +226,34 @@
 struct lineFile *vcfLf = lineFileOpen(vcfTn->forCgi, TRUE);
 char *line;
 while (lineFileNext(vcfLf, &line, NULL))
     fprintf(ctF, "%s\n",line);
 lineFileClose(&vcfLf);
 reportTiming(pStartTime, "add custom track for uploaded samples");
 return sampleTree;
 }
 
 static struct vcfFile *parseUserVcf(char *userVcfFile, int chromSize, int *pStartTime)
 /* Read in user VCF file and parse genotypes. */
 {
 struct vcfFile *userVcf = vcfFileMayOpen(userVcfFile, NULL, 0, chromSize, 0, chromSize, TRUE);
 if (! userVcf)
     return NULL;
-reportTiming(pStartTime, "read userVcf");
 struct vcfRecord *rec;
 for (rec = userVcf->records;  rec != NULL;  rec = rec->next)
     vcfParseGenotypesGtOnly(rec);
-reportTiming(pStartTime, "parse userVcf genotypes");
+reportTiming(pStartTime, "read userVcf and parse genotypes");
 return userVcf;
 }
 
 static void writeSubtreeTrackLine(FILE *ctF, struct subtreeInfo *ti, int subtreeNum, char *source,
                                   char *geneTrack, int fontHeight)
 /* Write a custom track line to ctF for one subtree. */
 {
 fprintf(ctF, "track name=subtree%d", subtreeNum);
 // Keep the description from being too long in order to avoid buffer overflow in hgTrackUi
 // (and limited space for track title in hgTracks)
 int descLen = fprintf(ctF, " description='Subtree %d: uploaded sample%s %s",
                       subtreeNum, (slCount(ti->subtreeUserSampleIds) > 1 ? "s" : ""),
                       ti->subtreeUserSampleIds->name);
 struct slName *id;
 for (id = ti->subtreeUserSampleIds->next;  id != NULL;  id = id->next)
@@ -285,113 +285,99 @@
 for (s = sampleNames;  s != NULL;  s = s->next)
     fprintf(f, "\t%s", s->name);
 fputc('\n', f);
 }
 
 static struct slRef *slRefListCopyReverse(struct slRef *inList)
 /* Return a clone of inList but in reverse order. */
 {
 struct slRef *outList = NULL;
 struct slRef *ref;
 for (ref = inList;  ref != NULL;  ref = ref->next)
     slAddHead(&outList, slRefNew(ref->val));
 return outList;
 }
 
-static void treeToBaseAlleles(struct phyloTree *node, struct hash *condensedNodes, char *refBases,
+static void treeToBaseAlleles(struct phyloTree *node, char *refBases,
                               char **sampleBases, struct hash *sampleToIx, struct slRef *parentMuts)
 /* Traverse tree, building up an array of reference bases indexed by position, and an array
  * (indexed by pos) of sample allele arrays indexed by sampleToIx. */
 {
 struct slRef *nodeMuts = parentMuts;
 if (node->priv != NULL)
     {
     // Start out with a copy of parent node mutations in reverse order so we can add at head
     // and then reverse all.
     struct singleNucChange *sncs = node->priv;
     nodeMuts = slRefListCopyReverse(parentMuts);
     slAddHead(&nodeMuts, slRefNew(sncs));
     slReverse(&nodeMuts);
     }
 if (node->numEdges > 0)
     {
     int i;
     for (i = 0;  i < node->numEdges;  i++)
-        treeToBaseAlleles(node->edges[i], condensedNodes, refBases, sampleBases, sampleToIx,
-                          nodeMuts);
+        treeToBaseAlleles(node->edges[i], refBases, sampleBases, sampleToIx, nodeMuts);
     }
 else
     {
     // Leaf node: if in sampleToIx, then for each mutation (beware back-mutations),
     // set refBase if it has not already been set and set sample allele.
-    boolean allocdHere = FALSE;
-    struct slName *nameList = hashFindVal(condensedNodes, node->ident->name);
-    if (nameList == NULL)
-        {
-        nameList = slNameNew(node->ident->name);
-        allocdHere = TRUE;
-        }
-    struct slName *name;
-    for (name = nameList;  name != NULL;  name = name->next)
-        {
-        int ix = hashIntValDefault(sampleToIx, name->name, -1);
+    int ix = hashIntValDefault(sampleToIx, node->ident->name, -1);
     if (ix >= 0)
         {
         struct slRef *mlRef;
         for (mlRef = nodeMuts;  mlRef != NULL;  mlRef = mlRef->next)
             {
             struct singleNucChange *snc, *sncs = mlRef->val;
             for (snc = sncs;  snc != NULL;  snc = snc->next)
                 {
                 // Don't let a back-mutation overwrite the true ref value:
                 if (!refBases[snc->chromStart])
                     refBases[snc->chromStart] = snc->refBase;
                 if (sampleBases[snc->chromStart] == NULL)
                     AllocArray(sampleBases[snc->chromStart], sampleToIx->elCount);
                 sampleBases[snc->chromStart][ix] = snc->newBase;
                 }
             }
         }
     }
-    if (allocdHere)
-        slFreeList(&nameList);
-    }
 if (node->priv != NULL)
     slFreeList(&nodeMuts);
 }
 
 static void vcfToBaseAlleles(struct vcfFile *vcff, char *refBases, char **sampleBases,
                              struct hash *sampleToIx)
 /* Build up an array of reference bases indexed by position, and an array (indexed by pos)
  * of sample allele arrays indexed by sampleToIx, from variants & genotypes in vcff. */
 {
 int gtIxToSampleIx[vcff->genotypeCount];
 int gtIx;
 for (gtIx = 0;  gtIx < vcff->genotypeCount;  gtIx++)
     {
     int ix = hashIntValDefault(sampleToIx, vcff->genotypeIds[gtIx], -1);
     gtIxToSampleIx[gtIx] = ix;
     }
 struct vcfRecord *rec;
 for (rec = vcff->records;  rec != NULL;  rec = rec->next)
     {
     for (gtIx = 0;  gtIx < vcff->genotypeCount;  gtIx++)
         {
         int ix = gtIxToSampleIx[gtIx];
         if (ix >= 0)
             {
-            // If refBases was already set by bigTree, go with that.
+            // If refBases was already set by tree, go with that.
             if (!refBases[rec->chromStart])
                 refBases[rec->chromStart] = rec->alleles[0][0];
             if (sampleBases[rec->chromStart] == NULL)
                 AllocArray(sampleBases[rec->chromStart], sampleToIx->elCount);
             int alIx = rec->genotypes[gtIx].hapIxA;
             if (alIx < 0)
                 sampleBases[rec->chromStart][ix] = '.';
             else if (alIx > 0)
                 sampleBases[rec->chromStart][ix] = rec->alleles[alIx][0];
             // If alIx == 0 (ref), leave the sample base at '\0', the default assumption.
             }
         }
     }
 }
 
@@ -494,95 +480,92 @@
         fprintf(f, ",%c", toupper(altAlleles[altIx]));
         fputs("\t.\t.\t", f);
         fprintf(f, "AC=%d", altCounts[0]);
         for (altIx = 1;  altIx < altCount;  altIx++)
             fprintf(f, ",%d", altCounts[altIx]);
         fprintf(f, ";AN=%d\tGT", sampleCount - noCallCount);
         int gtIx;
         for (gtIx = 0;  gtIx < sampleCount;  gtIx++)
             fprintf(f, "\t%c", genotypes[gtIx]);
         fputc('\n', f);
         }
     }
 }
 
 static void writeSubtreeVcf(FILE *f, struct hash *sampleToIx, char *chrom, int chromSize,
-                            struct vcfFile *userVcf, struct mutationAnnotatedTree *bigTree)
+                            struct vcfFile *userVcf, struct phyloTree *subtree)
 /* Write rows of VCF with genotypes for samples in sampleToIx (with order determined by sampleToIx)
- * with some samples found in userVcf and the rest found in bigTree which has been annotated
+ * with some samples found in userVcf and the rest found in subtree which has been annotated
  * with single-nucleotide variants. */
 {
 char refBases[chromSize];
 memset(refBases, 0, sizeof refBases);
 char *sampleBases[chromSize];
 memset(sampleBases, 0, sizeof sampleBases);
-treeToBaseAlleles(bigTree->tree, bigTree->condensedNodes, refBases, sampleBases, sampleToIx, NULL);
+treeToBaseAlleles(subtree, refBases, sampleBases, sampleToIx, NULL);
 vcfToBaseAlleles(userVcf, refBases, sampleBases, sampleToIx);
 int sampleCount = sampleToIx->elCount;
 baseAllelesToVcf(refBases, sampleBases, chrom, chromSize, sampleCount, f);
 int i;
 for (i = 0;  i < chromSize;  i++)
     freeMem(sampleBases[i]);
 }
 
 static void addSubtreeCustomTracks(FILE *ctF, char *userVcfFile, struct subtreeInfo *subtreeInfoList,
                                    struct hash *samplePlacements, char *chrom, int chromSize,
-                                   struct mutationAnnotatedTree *bigTree,
                                    char *source, char *geneTrack, int fontHeight, int *pStartTime)
 /* For each subtree trashfile, write VCF+treeFile custom track text to ctF. */
 {
 struct vcfFile *userVcf = parseUserVcf(userVcfFile, chromSize, pStartTime);
 if (! userVcf)
     {
     warn("Problem parsing VCF file with user variants '%s'; can't make subtree subtracks.",
          userVcfFile);
     return;
     }
-if (!bigTree)
-    return;
 struct subtreeInfo *ti;
 int subtreeNum;
 for (subtreeNum = 1, ti = subtreeInfoList;  ti != NULL;  ti = ti->next, subtreeNum++)
     {
     writeSubtreeTrackLine(ctF, ti, subtreeNum, source, geneTrack, fontHeight);
     writeVcfHeader(ctF, ti->subtreeNameList);
-    writeSubtreeVcf(ctF, ti->subtreeIdToIx, chrom, chromSize, userVcf, bigTree);
+    writeSubtreeVcf(ctF, ti->subtreeIdToIx, chrom, chromSize, userVcf, ti->subtree);
     fputc('\n', ctF);
     }
 vcfFileFree(&userVcf);
 reportTiming(pStartTime, "write subtree custom tracks");
 }
 
 struct tempName *writeCustomTracks(char *db, struct tempName *vcfTn, struct usherResults *ur,
-                                   struct slName *sampleIds, struct mutationAnnotatedTree *bigTree,
-                                   char *source, int fontHeight,
+                                   struct slName *sampleIds, char *source, int fontHeight,
                                    struct phyloTree **retSampleTree, int *pStartTime)
 /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */
 {
 char *chrom = hDefaultChrom(db);
 int chromSize = hChromSize(db, chrom);
 char *geneTrack = phyloPlaceDbSetting(db, "geneTrack");
 // Default to SARS-CoV-2 or hMPXV values if setting is missing from config.ra.
 if (isEmpty(geneTrack))
     geneTrack = sameString(db, "wuhCor1") ? "ncbiGeneBGP" : "ncbiGene";
 struct tempName *ctVcfTn = userVcfWithImputedBases(vcfTn, ur->samplePlacements, sampleIds,
-                                                   chromSize);
+                                                   chromSize, pStartTime);
 struct tempName *ctTn;
 AllocVar(ctTn);
 trashDirFile(ctTn, "ct", "ct_pp", ".ct");
 FILE *ctF = mustOpen(ctTn->forCgi, "w");
 int subtreeCount = slCount(ur->subtreeInfoList);
 if (subtreeCount <= MAX_SUBTREE_CTS)
     addSubtreeCustomTracks(ctF, ctVcfTn->forCgi, ur->subtreeInfoList, ur->samplePlacements,
-                           chrom, chromSize, bigTree, source, geneTrack, fontHeight, pStartTime);
+                           chrom, chromSize, source, geneTrack, fontHeight, pStartTime);
 else
     printf("<p>Subtree custom tracks are added when there are at most %d subtrees, "
            "but %d subtrees were found.</p>\n",
            MAX_SUBTREE_CTS, subtreeCount);
-struct phyloTree *sampleTree = addSampleOnlyCustomTrack(ctF, ctVcfTn, ur->bigTreePlusTn->forCgi,
+struct phyloTree *sampleTree = addSampleOnlyCustomTrack(ctF, ctVcfTn,
+                                                        ur->singleSubtreeInfo->subtreeTn->forCgi,
                                                         sampleIds, geneTrack, fontHeight,
                                                         pStartTime);
 if (retSampleTree)
     *retSampleTree = sampleTree;
 carefulClose(&ctF);
 return ctTn;
 }